# Set options for report generation
knitr::opts_chunk$set(
 fig.width = 6,
 fig.asp = 0.8,
 out.width = "80%"
)
# Deactivate scientific notation
options(scipen=999)
# Load packages
pacman::p_load(
  rlang, 
  an9elproject, 
  an9elversions, 
  magrittr, 
  tidyverse, 
  lubridate,
  naniar, 
  mice,
  survival, 
  survminer, 
  gridExtra, 
  # ggstance, 
  ggthemes,
  ggpval, 
  ggstatsplot, 
  # GGally, 
  install = FALSE, update = FALSE
  )

# source("http://peterhaschke.com/Code/multiplot.R") #load multiplot function
# Custom functions

# Run Shapiro-Wilks test on multiple variables
shapiro_test_summary <- function(data, columns) {
  results <- list()
  
  for (col in columns) {
    test_result <- shapiro.test(data[[col]])
    is_normal <- ifelse(test_result$p.value > 0.05, "Normally distributed", "Not normally distributed")
    
    results[[col]] <- list(
      p_value = test_result$p.value,
      normality = is_normal
    )
  }
  
  return(results)
}

# Run Kolmogorov-Smirnov test on multiple variables
lillie_test_summary <- function(data, columns) {
  results <- list()
  
  for (col in columns) {
    test_result <- nortest::lillie.test(data[[col]])
    is_normal <- ifelse(test_result$p.value > 0.05, "Normally distributed", "Not normally distributed")
    
    results[[col]] <- list(
      p_value = test_result$p.value,
      normality = is_normal
    )
  }
  
  return(results)
}

# Stacked density plot and boxplot 
stacked_density_plot_and_boxplot = function(data_df, variable_to_plot, plot_title) {
  
  outplot = ggplot(data = data_df, 
                    aes(x = {{variable_to_plot}}, 
                        y = -1)) + 
    geom_boxplot(outlier.colour = "red") + 
    stat_boxplot(geom = "errorbar") +
    geom_density(aes(x = {{variable_to_plot}}, 
                     y = stat(scaled)), 
                 fill = "lightgrey",
                 inherit.aes = FALSE) +
    #ylim(range(!!variable_to_plot, na.rm = TRUE)) +
    labs(title = plot_title, 
         x = "",
         y = "") +
    theme_hc(base_size = 8) + 
    theme(#axis.line.y = element_blank(),
          #axis.title.x = element_blank(),
          #axis.text.x = element_blank(),
          #axis.ticks.x = element_blank(), 
          plot.title = element_text(hjust = 0.5), 
          aspect.ratio = 3/5) + 
    coord_flip()
  
  return(outplot)
}

# Fancy barplot showing percentages 
fancy_barplot_percentage = function(dataframe, xcolname) {
  
  outplot = ggplot(dataframe, 
                   aes(x = {{xcolname}}, #
                       fill = {{xcolname}})) + # 
    geom_bar(aes(y = (..count..)/sum(..count..)), colour = "black") + 
    geom_text(aes(label = scales::percent((..count..)/sum(..count..)), 
                  y = (..count..)/sum(..count..)), 
              stat = "count", 
              vjust = -0.5) + 
    theme_bw() +
    theme(legend.position = "none", 
          axis.text.x = element_text(colour = "black", size = 10), 
          axis.text.y = element_text(colour = "black")) + 
    scale_y_continuous(labels = scales::percent, limits = c(0, 1)) + 
    # xlab(
    #   str_to_title(
    #     str_replace_all(
    #       substitute(xcolname), pattern = "_", replacement = " "))) + 
    ylab("%") +
    scale_fill_brewer(palette = "Set3")
    
  return(outplot)
}


# Fancy barplot for categorical variable with controls vs cases
fancy_barplot_ctrls_vs_cs = function (dataframe, xcolname) {
  
  outplot = ggplot(dataframe, aes(x = {{xcolname}}, fill = patient_group)) + 
  geom_bar(aes(y = (..count..)/sum(..count..)), 
           colour = "black", 
           position = "dodge") + 
    theme_bw() +
    labs(fill = "Experimental group") +
    theme(axis.text.x = element_text(colour = "black", size = 10), 
          axis.text.y = element_text(colour = "black")) + 
    scale_y_continuous(labels = scales::percent, limits = c(0, 1)) + 
    xlab(
      str_to_title(
        str_replace_all(
          substitute(xcolname), pattern = "_", replacement = " "))) + 
    ylab("%") +
    scale_fill_brewer(palette = "Set3")
  
  return(outplot)
}

Load data

# Load database
oncoth1 = get_project("oncothr1", version = "0.0.8003")
## ! Your oncothr1 project is version: 0.0.8003
## ! Last oncothr1 version online is: 0.0.8005
## ℹ Please, update an9elversions
# load("/mnt/ir-bioinf02/home/blobato/oncothromb01/oncothr1_versions/oncothr1.RData_0.0.8003")
# oncoth1 = obj
# rm(obj)
# Get data slot
oncoth1_data = oncoth1$data

Exploratory Data Analysis (EDA) of clinical factors

Descriptive table

# Transform some variables for better readability
oncoth1_data = oncoth1_data %>% 
  mutate(khorana_risk_score_three_categories = case_when(
    khorana_risk_score == 0 ~ "Low risk", 
    khorana_risk_score == 1 ~ "Intermediate risk",
    khorana_risk_score == 2 ~ "Intermediate risk",
    khorana_risk_score == 3 ~ "High risk",
    khorana_risk_score == 4 ~ "High risk",
    khorana_risk_score == 5 ~ "High risk",
    khorana_risk_score == NA ~ NA_character_), .after = khorana_risk_score) %>%
  mutate(khorana_risk_score_three_categories = factor(
    x = khorana_risk_score_three_categories, 
    levels = c("Low risk", "Intermediate risk", "High risk"), 
    labels = c("Low risk", "Intermediate risk", "High risk")
  ))
# Select variables
vars2select = c(
  "patient_group",
  "age_when_cancer_dx", 
  "gender", 
  "ethnicity",
  "bmi_category",
  "performance_status_category_corrected",
  "n_comorbidities",
  "arterial_hypertension",
  "chronic_cardiac_insufficiency",
  "diabetes_mellitus",
  "dyslipidemia",
  "tobacco_use",
  "copd",
  "renal_insufficiency", 
  "venous_insufficiency", 
  "hormone_therapy", 
  "antiaggreg_tx_currently",
  "previous_vte",
  "previous_ate",
  "family_background_vte",
  "khorana_risk_score_three_categories",
  "primary_tumor", 
  "tnm_stage_grouped",
  "grade_histological_differentiation",
  "tumor_surgically_removed",
  "catheter_device", 
  "major_surgery_in_medical_history",
  "n_times_recent_major_surgery_all_study",
  "n_times_major_trauma_all_study",
  "n_times_blood_transfusion_all_study", 
  "n_times_immobilisation_all_study",
  "n_times_erythropoietin_tx_all_study"
  )

oncoth1_subset = oncoth1_data %>%
  select(all_of(vars2select)) %>%
  mutate(across(c(
    n_times_recent_major_surgery_all_study,
    n_times_major_trauma_all_study,
    n_times_blood_transfusion_all_study, 
    n_times_immobilisation_all_study, 
    n_times_erythropoietin_tx_all_study), ~ as.factor(.x)))

# Make summary table
oncoth1_st = arsenal::tableby(patient_group ~., data = oncoth1_subset)
summary(oncoth1_st)
## 
## 
## |                                            |   Case (N=88)   | Control (N=302) |  Total (N=390)  | p value|
## |:-------------------------------------------|:---------------:|:---------------:|:---------------:|-------:|
## |**age_when_cancer_dx**                      |                 |                 |                 |   0.963|
## |&nbsp;&nbsp;&nbsp;Mean (SD)                 | 64.284 (11.226) | 64.225 (10.328) | 64.238 (10.522) |        |
## |&nbsp;&nbsp;&nbsp;Range                     | 42.000 - 86.000 | 33.000 - 93.000 | 33.000 - 93.000 |        |
## |**gender**                                  |                 |                 |                 |   0.224|
## |&nbsp;&nbsp;&nbsp;Male                      |   53 (60.2%)    |   203 (67.2%)   |   256 (65.6%)   |        |
## |&nbsp;&nbsp;&nbsp;Female                    |   35 (39.8%)    |   99 (32.8%)    |   134 (34.4%)   |        |
## |**ethnicity**                               |                 |                 |                 |   0.675|
## |&nbsp;&nbsp;&nbsp;Caucasian                 |   87 (98.9%)    |   291 (96.4%)   |   378 (96.9%)   |        |
## |&nbsp;&nbsp;&nbsp;Latin-American            |    1 (1.1%)     |    9 (3.0%)     |    10 (2.6%)    |        |
## |&nbsp;&nbsp;&nbsp;Black/African American    |    0 (0.0%)     |    1 (0.3%)     |    1 (0.3%)     |        |
## |&nbsp;&nbsp;&nbsp;Sephardic Jew             |    0 (0.0%)     |    1 (0.3%)     |    1 (0.3%)     |        |
## |**bmi_category**                            |                 |                 |                 |   0.986|
## |&nbsp;&nbsp;&nbsp;Underweight               |    3 (3.4%)     |    11 (3.6%)    |    14 (3.6%)    |        |
## |&nbsp;&nbsp;&nbsp;Normal                    |   43 (48.9%)    |   152 (50.3%)   |   195 (50.0%)   |        |
## |&nbsp;&nbsp;&nbsp;Overweight                |   30 (34.1%)    |   102 (33.8%)   |   132 (33.8%)   |        |
## |&nbsp;&nbsp;&nbsp;Obese                     |   12 (13.6%)    |   37 (12.3%)    |   49 (12.6%)    |        |
## |**performance_status_category_corrected**   |                 |                 |                 |   0.046|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |        1        |        7        |        8        |        |
## |&nbsp;&nbsp;&nbsp;0                         |   24 (27.6%)    |   121 (41.0%)   |   145 (38.0%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |   52 (59.8%)    |   152 (51.5%)   |   204 (53.4%)   |        |
## |&nbsp;&nbsp;&nbsp;2                         |   11 (12.6%)    |    22 (7.5%)    |    33 (8.6%)    |        |
## |**n_comorbidities**                         |                 |                 |                 |   0.992|
## |&nbsp;&nbsp;&nbsp;0                         |   28 (31.8%)    |   94 (31.1%)    |   122 (31.3%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |   27 (30.7%)    |   94 (31.1%)    |   121 (31.0%)   |        |
## |&nbsp;&nbsp;&nbsp;2+                        |   33 (37.5%)    |   114 (37.7%)   |   147 (37.7%)   |        |
## |**arterial_hypertension**                   |                 |                 |                 |   0.993|
## |&nbsp;&nbsp;&nbsp;No                        |   49 (55.7%)    |   168 (55.6%)   |   217 (55.6%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |   39 (44.3%)    |   134 (44.4%)   |   173 (44.4%)   |        |
## |**chronic_cardiac_insufficiency**           |                 |                 |                 |   0.336|
## |&nbsp;&nbsp;&nbsp;No                        |   87 (98.9%)    |   293 (97.0%)   |   380 (97.4%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    1 (1.1%)     |    9 (3.0%)     |    10 (2.6%)    |        |
## |**diabetes_mellitus**                       |                 |                 |                 |   0.253|
## |&nbsp;&nbsp;&nbsp;No                        |   75 (85.2%)    |   241 (79.8%)   |   316 (81.0%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |   13 (14.8%)    |   61 (20.2%)    |   74 (19.0%)    |        |
## |**dyslipidemia**                            |                 |                 |                 |   0.096|
## |&nbsp;&nbsp;&nbsp;No                        |   51 (58.0%)    |   204 (67.5%)   |   255 (65.4%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |   37 (42.0%)    |   98 (32.5%)    |   135 (34.6%)   |        |
## |**tobacco_use**                             |                 |                 |                 |   0.184|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |        0        |        4        |        4        |        |
## |&nbsp;&nbsp;&nbsp;Never has smoked          |   28 (31.8%)    |   87 (29.2%)    |   115 (29.8%)   |        |
## |&nbsp;&nbsp;&nbsp;Former smoker             |   35 (39.8%)    |   149 (50.0%)   |   184 (47.7%)   |        |
## |&nbsp;&nbsp;&nbsp;Active smoker             |   25 (28.4%)    |   62 (20.8%)    |   87 (22.5%)    |        |
## |**copd**                                    |                 |                 |                 |   0.139|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |        0        |        1        |        1        |        |
## |&nbsp;&nbsp;&nbsp;No                        |   80 (90.9%)    |   255 (84.7%)   |   335 (86.1%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    8 (9.1%)     |   46 (15.3%)    |   54 (13.9%)    |        |
## |**renal_insufficiency**                     |                 |                 |                 |   0.588|
## |&nbsp;&nbsp;&nbsp;No                        |   83 (94.3%)    |   289 (95.7%)   |   372 (95.4%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    5 (5.7%)     |    13 (4.3%)    |    18 (4.6%)    |        |
## |**venous_insufficiency**                    |                 |                 |                 |   0.493|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |        0        |        1        |        1        |        |
## |&nbsp;&nbsp;&nbsp;No                        |   79 (89.8%)    |   262 (87.0%)   |   341 (87.7%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    9 (10.2%)    |   39 (13.0%)    |   48 (12.3%)    |        |
## |**hormone_therapy**                         |                 |                 |                 |   0.890|
## |&nbsp;&nbsp;&nbsp;No                        |   87 (98.9%)    |   298 (98.7%)   |   385 (98.7%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    1 (1.1%)     |    4 (1.3%)     |    5 (1.3%)     |        |
## |**antiaggreg_tx_currently**                 |                 |                 |                 |   0.063|
## |&nbsp;&nbsp;&nbsp;No                        |   80 (90.9%)    |   250 (82.8%)   |   330 (84.6%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    8 (9.1%)     |   52 (17.2%)    |   60 (15.4%)    |        |
## |**previous_vte**                            |                 |                 |                 |   0.021|
## |&nbsp;&nbsp;&nbsp;No                        |   82 (93.2%)    |   296 (98.0%)   |   378 (96.9%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    6 (6.8%)     |    6 (2.0%)     |    12 (3.1%)    |        |
## |**previous_ate**                            |                 |                 |                 |   0.174|
## |&nbsp;&nbsp;&nbsp;No                        |   82 (93.2%)    |   266 (88.1%)   |   348 (89.2%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    6 (6.8%)     |   36 (11.9%)    |   42 (10.8%)    |        |
## |**family_background_vte**                   |                 |                 |                 |   0.090|
## |&nbsp;&nbsp;&nbsp;No                        |   81 (92.0%)    |   291 (96.4%)   |   372 (95.4%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    7 (8.0%)     |    11 (3.6%)    |    18 (4.6%)    |        |
## |**khorana_risk_score_three_categories**     |                 |                 |                 |   0.157|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |        0        |        1        |        1        |        |
## |&nbsp;&nbsp;&nbsp;Low risk                  |   19 (21.6%)    |   89 (29.6%)    |   108 (27.8%)   |        |
## |&nbsp;&nbsp;&nbsp;Intermediate risk         |   47 (53.4%)    |   160 (53.2%)   |   207 (53.2%)   |        |
## |&nbsp;&nbsp;&nbsp;High risk                 |   22 (25.0%)    |   52 (17.3%)    |   74 (19.0%)    |        |
## |**primary_tumor**                           |                 |                 |                 | < 0.001|
## |&nbsp;&nbsp;&nbsp;Colorectal cancer         |   28 (31.8%)    |   134 (44.4%)   |   162 (41.5%)   |        |
## |&nbsp;&nbsp;&nbsp;Esophageal cancer         |    2 (2.3%)     |    12 (4.0%)    |    14 (3.6%)    |        |
## |&nbsp;&nbsp;&nbsp;Gastric cancer            |   10 (11.4%)    |   45 (14.9%)    |   55 (14.1%)    |        |
## |&nbsp;&nbsp;&nbsp;NSCLC                     |   15 (17.0%)    |   72 (23.8%)    |   87 (22.3%)    |        |
## |&nbsp;&nbsp;&nbsp;Pancreatic cancer         |   33 (37.5%)    |   39 (12.9%)    |   72 (18.5%)    |        |
## |**tnm_stage_grouped**                       |                 |                 |                 |   0.005|
## |&nbsp;&nbsp;&nbsp;I-II                      |   11 (12.5%)    |   60 (19.9%)    |   71 (18.2%)    |        |
## |&nbsp;&nbsp;&nbsp;III                       |   23 (26.1%)    |   116 (38.4%)   |   139 (35.6%)   |        |
## |&nbsp;&nbsp;&nbsp;IV                        |   54 (61.4%)    |   126 (41.7%)   |   180 (46.2%)   |        |
## |**grade_histological_differentiation**      |                 |                 |                 |   0.735|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |       28        |       61        |       89        |        |
## |&nbsp;&nbsp;&nbsp;Well differentiated       |   10 (16.7%)    |   37 (15.4%)    |   47 (15.6%)    |        |
## |&nbsp;&nbsp;&nbsp;Moderately differentiated |   28 (46.7%)    |   126 (52.3%)   |   154 (51.2%)   |        |
## |&nbsp;&nbsp;&nbsp;Poorly differentiated     |   22 (36.7%)    |   78 (32.4%)    |   100 (33.2%)   |        |
## |**tumor_surgically_removed**                |                 |                 |                 |   0.026|
## |&nbsp;&nbsp;&nbsp;No                        |   70 (79.5%)    |   203 (67.2%)   |   273 (70.0%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |   18 (20.5%)    |   99 (32.8%)    |   117 (30.0%)   |        |
## |**catheter_device**                         |                 |                 |                 |   0.029|
## |&nbsp;&nbsp;&nbsp;N-Miss                    |        0        |        1        |        1        |        |
## |&nbsp;&nbsp;&nbsp;No                        |   46 (52.3%)    |   196 (65.1%)   |   242 (62.2%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |   42 (47.7%)    |   105 (34.9%)   |   147 (37.8%)   |        |
## |**major_surgery_in_medical_history**        |                 |                 |                 |   0.597|
## |&nbsp;&nbsp;&nbsp;No                        |   87 (98.9%)    |   296 (98.0%)   |   383 (98.2%)   |        |
## |&nbsp;&nbsp;&nbsp;Yes                       |    1 (1.1%)     |    6 (2.0%)     |    7 (1.8%)     |        |
## |**n_times_recent_major_surgery_all_study**  |                 |                 |                 |   0.585|
## |&nbsp;&nbsp;&nbsp;0                         |   66 (75.0%)    |   206 (68.2%)   |   272 (69.7%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |   19 (21.6%)    |   83 (27.5%)    |   102 (26.2%)   |        |
## |&nbsp;&nbsp;&nbsp;2                         |    2 (2.3%)     |    11 (3.6%)    |    13 (3.3%)    |        |
## |&nbsp;&nbsp;&nbsp;3                         |    1 (1.1%)     |    2 (0.7%)     |    3 (0.8%)     |        |
## |**n_times_major_trauma_all_study**          |                 |                 |                 |   0.589|
## |&nbsp;&nbsp;&nbsp;0                         |   88 (100.0%)   |   301 (99.7%)   |   389 (99.7%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |    0 (0.0%)     |    1 (0.3%)     |    1 (0.3%)     |        |
## |**n_times_blood_transfusion_all_study**     |                 |                 |                 |   0.004|
## |&nbsp;&nbsp;&nbsp;0                         |   55 (62.5%)    |   244 (80.8%)   |   299 (76.7%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |   26 (29.5%)    |   49 (16.2%)    |   75 (19.2%)    |        |
## |&nbsp;&nbsp;&nbsp;2                         |    5 (5.7%)     |    7 (2.3%)     |    12 (3.1%)    |        |
## |&nbsp;&nbsp;&nbsp;3                         |    2 (2.3%)     |    2 (0.7%)     |    4 (1.0%)     |        |
## |**n_times_immobilisation_all_study**        |                 |                 |                 |   0.580|
## |&nbsp;&nbsp;&nbsp;0                         |   66 (75.0%)    |   239 (79.1%)   |   305 (78.2%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |   19 (21.6%)    |   48 (15.9%)    |   67 (17.2%)    |        |
## |&nbsp;&nbsp;&nbsp;2                         |    3 (3.4%)     |    14 (4.6%)    |    17 (4.4%)    |        |
## |&nbsp;&nbsp;&nbsp;3                         |    0 (0.0%)     |    1 (0.3%)     |    1 (0.3%)     |        |
## |**n_times_erythropoietin_tx_all_study**     |                 |                 |                 |   0.116|
## |&nbsp;&nbsp;&nbsp;0                         |   82 (93.2%)    |   287 (95.0%)   |   369 (94.6%)   |        |
## |&nbsp;&nbsp;&nbsp;1                         |    5 (5.7%)     |    10 (3.3%)    |    15 (3.8%)    |        |
## |&nbsp;&nbsp;&nbsp;2                         |    0 (0.0%)     |    5 (1.7%)     |    5 (1.3%)     |        |
## |&nbsp;&nbsp;&nbsp;3                         |    1 (1.1%)     |    0 (0.0%)     |    1 (0.3%)     |        |
# Median and quantiles of age at cancer diagnosis

print("Quantiles of age at cancer diagnosis for all patients:")
## [1] "Quantiles of age at cancer diagnosis for all patients:"
quantile(oncoth1_data$age_when_cancer_dx)
##   0%  25%  50%  75% 100% 
##   33   57   64   72   93
cat("\n")
print("Quantiles of age at cancer diagnosis for patients with CAT:")
## [1] "Quantiles of age at cancer diagnosis for patients with CAT:"
oncoth1_data %>%
  filter(patient_group == "Case") %>%
  select(age_when_cancer_dx) %>%
  quantile(., na.rm = TRUE)
##    0%   25%   50%   75%  100% 
## 42.00 54.75 65.00 73.00 86.00
cat("\n")
print("Quantiles of age at cancer diagnosis for patients with CAT:")
## [1] "Quantiles of age at cancer diagnosis for patients with CAT:"
oncoth1_data %>%
  filter(patient_group == "Control") %>%
  select(age_when_cancer_dx) %>%
  quantile(., na.rm = TRUE)
##   0%  25%  50%  75% 100% 
##   33   58   64   71   93
cat("\n")

Check normality in continous variables

https://stats.stackexchange.com/questions/32168/how-important-is-it-to-transform-variable-for-cox-proportional-hazards

# Example usage with your specific data
columns_to_test <- c(
  "age_when_cancer_dx", 
  "weight", 
  "height", 
  "body_surface_area", 
  "bmi_value", 
  "albumin", 
  "aptt_ratio", 
  "bilirubin", 
  "creatinine", 
  "alkaline_phosphatase", 
  "hemoglobin", 
  "inr", 
  "leukocytes", 
  "platelets"
)
# Run Shapiro-Wilks normality test 
shapiro_results <- shapiro_test_summary(oncoth1_data, columns_to_test)

# Print the results
for (col in names(shapiro_results)) {
  print("Shapiro-Wilk normality test")
  cat(col, ":", shapiro_results[[col]]$normality, "(p-value =", shapiro_results[[col]]$p_value, ")\n")
}
## [1] "Shapiro-Wilk normality test"
## age_when_cancer_dx : Normally distributed (p-value = 0.1009268 )
## [1] "Shapiro-Wilk normality test"
## weight : Not normally distributed (p-value = 0.000001926424 )
## [1] "Shapiro-Wilk normality test"
## height : Not normally distributed (p-value = 0.00415426 )
## [1] "Shapiro-Wilk normality test"
## body_surface_area : Not normally distributed (p-value = 0.00002113358 )
## [1] "Shapiro-Wilk normality test"
## bmi_value : Not normally distributed (p-value = 0.00000000000359157 )
## [1] "Shapiro-Wilk normality test"
## albumin : Not normally distributed (p-value = 0.000009828937 )
## [1] "Shapiro-Wilk normality test"
## aptt_ratio : Not normally distributed (p-value = 0.000000000144287 )
## [1] "Shapiro-Wilk normality test"
## bilirubin : Not normally distributed (p-value = 0.00000000000000000000000000000000294055 )
## [1] "Shapiro-Wilk normality test"
## creatinine : Not normally distributed (p-value = 0.00000000007557108 )
## [1] "Shapiro-Wilk normality test"
## alkaline_phosphatase : Not normally distributed (p-value = 0.0000000000000000000000000000001510757 )
## [1] "Shapiro-Wilk normality test"
## hemoglobin : Normally distributed (p-value = 0.06060825 )
## [1] "Shapiro-Wilk normality test"
## inr : Not normally distributed (p-value = 0.0000000003900261 )
## [1] "Shapiro-Wilk normality test"
## leukocytes : Not normally distributed (p-value = 0.0000000000000000000007705813 )
## [1] "Shapiro-Wilk normality test"
## platelets : Not normally distributed (p-value = 0.000000000000003072663 )
# Run Shapiro-Wilks normality test 
lillie_results <- lillie_test_summary(oncoth1_data, columns_to_test)

# Print the results
for (col in names(lillie_results)) {
  print("Lilliefors (Kolmogorov-Smirnov) normality test")
  cat(col, ":", lillie_results[[col]]$normality, "(p-value =", lillie_results[[col]]$p_value, ")\n")
}
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## age_when_cancer_dx : Normally distributed (p-value = 0.1833302 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## weight : Not normally distributed (p-value = 0.0004022078 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## height : Not normally distributed (p-value = 0.0001375539 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## body_surface_area : Not normally distributed (p-value = 0.00000000003434639 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## bmi_value : Not normally distributed (p-value = 0.000000205658 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## albumin : Not normally distributed (p-value = 0.00000000001496203 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## aptt_ratio : Not normally distributed (p-value = 0.008137728 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## bilirubin : Not normally distributed (p-value = 0.0000000000000000000000000000000000000000000000000000000000000000000000000000008538166 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## creatinine : Not normally distributed (p-value = 0.000001242227 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## alkaline_phosphatase : Not normally distributed (p-value = 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002219109 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## hemoglobin : Not normally distributed (p-value = 0.002121832 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## inr : Not normally distributed (p-value = 0.0000000000003302228 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## leukocytes : Not normally distributed (p-value = 0.00000000000000000260768 )
## [1] "Lilliefors (Kolmogorov-Smirnov) normality test"
## platelets : Not normally distributed (p-value = 0.0000000000000002683373 )

Missing values’ proportion

print(paste("There are", n_var_miss(oncoth1_data), "variables with missing data."))
## [1] "There are 343 variables with missing data."
# Visual inspection of proportion of missing values
# Leave out dates and other columns with a lot of NAs for patients w/o VTE
oncoth1_data %>%
  select(!c(n_appointment_patient_became_case, 
            patient_left_study, 
            anticoag_tx_lmwh_dosage, 
            anticoag_tx_vte_drugs, 
            vte_before_entering_study)) %>%
  select(!contains("reason")) %>%
  select(!contains("date")) %>%
  select(!ends_with("_levels")) %>%
  select(!starts_with("other_thromb_")) %>%
  select(!starts_with("type_")) %>%
  select(!starts_with("cancer_concomitant_")) %>%
  select(!starts_with("eval_")) %>%
  select(!starts_with("rec_")) %>%
  select(!starts_with("tu_")) %>%
  select(!ends_with("_prior_study")) %>%
  select(!ends_with("_along_study")) %>%
  select(!ends_with("_recurrence")) %>%
  select_if( ~ any(is.na(.))) %>%
  gg_miss_var(., show_pct = TRUE)

# Visual inspection of proportion of missing values
# Leave out dates and other columns with a lot of NAs for patients w/o VTE
gg_miss_fct(
  oncoth1_data %>%
  select(!contains("reason")) %>%
  select(!contains("date")) %>%
  select(!ends_with("_levels")) %>%
  select(!starts_with("other_thromb_")) %>%
  select(!starts_with("type_")) %>%
  select(!starts_with("cancer_concomitant_")) %>%
  select(!starts_with("eval_")) %>%
  select(!starts_with("rec_")) %>%
  select(!starts_with("tu_")) %>%
  select(!ends_with("_prior_study")) %>%
  select(!ends_with("_along_study")) %>%
  select(!ends_with("_recurrence")),
  fct = patient_group)

Univariate plots

# Proportion of patient with VTE vs no VTE
table(oncoth1_data$patient_group, useNA = "ifany")
## 
##    Case Control 
##      88     302

https://aosmith.rbind.io/2019/05/13/small-multiples-plot/

hist_plot = ggplot(oncoth1_data, aes(x = age_when_cancer_dx)) + 
  geom_histogram(aes(y = (..count..)/sum(..count..)), 
                 breaks = seq(from = 25, to = 100, by = 5)) + 
  xlab("") + 
  ylab("Proportion") + 
  scale_x_continuous(breaks = seq(from = 20, to = 100, by = 10))

boxplot = ggplot(oncoth1_data, aes(x = age_when_cancer_dx)) + 
  geom_boxplot() + 
  theme(axis.text.y = element_blank(), 
        axis.ticks = element_blank(), 
        plot.margin = unit(c(0.0001, 0.25, 1, 1), "cm")) + 
  scale_x_continuous(breaks = seq(from = 20, to = 100, by = 10))


ggpubr::ggarrange(hist_plot, boxplot, 
                  ncol = 1, nrow = 2, 
                  #heights = c(1, 0.5),
                  align = "v")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Age and VTE
ggbetweenstats(
  data = oncoth1_data,
  x = patient_group,
  y = age_when_cancer_dx,
  xlab = "Patient group",
  ylab = "Age at cancer diagnosis",
  type = "barplot",
  messages = FALSE  # Set to TRUE to display additional messages
)

ggpiestats(
  data = oncoth1_data,
  x = gender,
  y = patient_group
)

ggpiestats(
  data = oncoth1_data,
  x = ethnicity,
  y = patient_group,
  type = "nonparametric"
)

ggpiestats(
  data = oncoth1_data,
  x = menopausal_status,
  y = patient_group,
  type = "nonparametric"
)

The 64.95% of the data in this variable (NAs) corresponds to men.

ggpiestats(
  data = oncoth1_data,
  x = bmi_category,
  y = patient_group,
  type = "nonparametric"
)

ggpiestats(
  data = oncoth1_data,
  x = performance_status_category,
  y = patient_group,
  type = "nonparametric"
)

# Personal background by VTE
ggpiestats(
  data = oncoth1_data,
  x = personal_background,
  y = patient_group,
  type = "nonparametric"
)

# Major surgery in medical history by VTE
ggpiestats(
  data = oncoth1_data,
  x = major_surgery_in_medical_history,
  y = patient_group,
  type = "nonparametric"
)

ggpiestats(
  data = oncoth1_data,
  x = hormone_therapy,
  y = patient_group,
  type = "nonparametric"
)

ggpiestats(
  data = oncoth1_data,
  x = pregnancy,
  y = patient_group,
  type = "nonparametric"
)

ggpiestats(
  data = oncoth1_data,
  x = oral_contraceptive_tx,
  y = patient_group,
  type = "nonparametric"
)

ggpiestats(
  data = oncoth1_data,
  x = diabetes_mellitus,
  y = patient_group,
  type = "nonparametric"
)

fancy_barplot_percentage(oncoth1_data, dyslipidemia)

fancy_barplot_percentage(oncoth1_data, tobacco_use)

fancy_barplot_percentage(oncoth1_data, copd)

fancy_barplot_percentage(oncoth1_data, arterial_hypertension)

fancy_barplot_percentage(oncoth1_data, chronic_cardiac_insufficiency)

fancy_barplot_percentage(oncoth1_data, renal_insufficiency)

fancy_barplot_percentage(oncoth1_data, previous_onco_surgery) + 
  xlab("Previous Oncologic Surgery")

fancy_barplot_percentage(oncoth1_data, primary_tumor) + 
  theme(axis.text.x = element_text(size = 9))

fancy_barplot_percentage(oncoth1_data, tumor_surgically_removed)

fancy_barplot_percentage(oncoth1_data, progression_according_to_clinical_stage) + 
  theme(axis.text.x = element_blank(), 
        axis.ticks.x = element_blank(), 
        legend.position = c(0.5, 0.8), 
        legend.title = element_blank(), 
        legend.background = element_blank(),
        legend.text=element_text(size = 11))
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fancy_barplot_percentage(oncoth1_data, tnm_stage) 

fancy_barplot_percentage(oncoth1_data, t_stage)

fancy_barplot_percentage(oncoth1_data, n_stage)

fancy_barplot_percentage(oncoth1_data, m_stage)

fancy_barplot_percentage(oncoth1_data, histology_type)

fancy_barplot_percentage(oncoth1_data, mucinous_histology)

fancy_barplot_percentage(oncoth1_data, grade_histological_differentiation)

fancy_barplot_percentage(oncoth1_data, catheter_device)

fancy_barplot_percentage(oncoth1_data, venous_insufficiency)

fancy_barplot_percentage(oncoth1_data, family_background_vte_all_relatives) +
  xlab("Family background - VTE") + 
  scale_x_discrete(limits = c("No relatives", 
                              "Father", 
                              "Mother", 
                              "Siblings",
                              "Others",
                              "Offspring", 
                              "Two relatives"))

fancy_barplot_percentage(oncoth1_data, previous_vte)

fancy_barplot_percentage(oncoth1_data, previous_ate_type) +
  xlab("Type of Previous ATE") + 
  scale_x_discrete(limits = c("No ATE", 
                              "Myocardial infarction", 
                              "CVA", 
                              "PAD", 
                              "TIA", 
                              "Other"))
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_count()`).
## Removed 4 rows containing non-finite outside the scale range (`stat_count()`).

fancy_barplot_percentage(oncoth1_data, anticoag_tx_vte) + 
  xlab("Anticoagulant treatment due to previous VTE")

fancy_barplot_percentage(oncoth1_data, anticoag_tx_reason) + 
  xlab("Reason of prescription of anticoagulants") + 
  scale_x_discrete(limits = c("VTE", 
                              "Cardiopathy", 
                              "Others", 
                              NA))

fancy_barplot_percentage(oncoth1_data, anticoag_tx_vte_drugs) + 
  xlab("Drugs used in anticoagulant treatments") + 
  scale_x_discrete(limits = c("LMWH", 
                              "Vit K antagonists", 
                              "Dabigatran", 
                              NA))

fancy_barplot_percentage(oncoth1_data, anticoag_tx_currently)

fancy_barplot_percentage(oncoth1_data, cancer_diagnosed_due_to_thrombosis)

fancy_barplot_percentage(oncoth1_data, vte_dx_during_cancer_follow_up)

stacked_density_plot_and_boxplot(oncoth1_data, height, "Height")
## Warning: `stat(scaled)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(scaled)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Bivariate plots (no VTE vs VTE)

Use Phi coefficient to see correlation between two categorical variables, both of them adopting binary values. psych::phi()

In case the categorical variables have more than 2 unique values, use Crammer’s V. questionr::cramer.v()

#categorical_variables = c(variable.names(Filter(f = is.character, x = oncoth1_data)), 
                          #variable.names(Filter(f = is.factor, x = oncoth1_data)))

categorical_variables = c(
  "gender", 
  "ethnicity", 
  "menopausal_status", 
  "bmi_category", 
  "performance_status_category", 
  "leukocytosis", 
  "thrombocytosis", 
  "low_hemoglobin", 
  "personal_background", 
  # "other_personal_background", 
  "major_surgery_in_medical_history", 
  "hormone_therapy", 
  "pregnancy", 
  "oral_contraceptive_tx", 
  "diabetes_mellitus", 
  "dyslipidemia", 
  "tobacco_use", 
  "copd", 
  "arterial_hypertension", 
  "chronic_cardiac_insufficiency", 
  "renal_insufficiency", 
  "previous_onco_surgery", 
  "primary_tumor_simplified", 
  "tumor_surgically_removed", 
  "progression_according_to_clinical_stage", 
  "tnm_stage", 
  "tnm_stage_detailed", 
  "tnm_stage_grouped", 
  "t_stage", 
  "n_stage", 
  "m_stage", 
  "histology_type", 
  "mucinous_histology", 
  "grade_histological_differentiation", 
  "metastasis_dx", 
  "metastasis_adrenal_glands", 
  "metastasis_bones",                             
  "metastasis_brain",                               
  "metastasis_liver",                               
  "metastasis_lung",                                
  "metastasis_lymph_nodes",                         
  "metastasis_peritoneum",                          
  "metastasis_pleura",                              
  "metastasis_skin",                                
  "metastasis_soft_tissues",                        
  "other_metastases",
  "catheter_device", 
  "krs_category",
  "two_groups_krs", 
  "tic_onco", 
  "venous_insufficiency", 
  "family_background_VTE_father",    
  "family_background_VTE_mother",  
  "family_background_VTE_siblings", 
  "family_background_VTE_offspring", 
  "family_background_VTE_other_relatives", 
  "previous_vte", 
  "previous_ate",
  "cva_ate",
  "myocardial_infarction_ate", 
  "pad_ate", 
  "tia_ate", 
  "other_thromb_ate", 
  "anticoag_tx_vte",                                
  "anticoag_tx_vte_drugs",                          
  "anticoag_tx_cardiopathy",
  "anticoag_tx_other_causes", 
  "anticoag_tx_reason", 
  "anticoag_tx_apixaban",                           
  "anticoag_tx_dabigatran",                         
  "anticoag_tx_lmwh",                               
  "anticoag_tx_lmwh_dosage",                        
  "anticoag_tx_vit_k_antag",                        
  "anticoag_tx_other_drugs",                        
  "anticoag_tx_currently",                          
  "antiaggreg_tx_cardiopathy",                      
  "antiaggreg_tx_cva",                              
  "antiaggreg_tx_other_causes",
  "antiaggreg_tx_currently", 
  "vte_before_entering_study", 
  "cancer_associated_crt_vte", 
  "cancer_associated_ledvt_vte", 
  "cancer_associated_uedvt_vte", 
  "cancer_associated_pte_vte", 
  "cancer_associated_svt_vte", 
  "cancer_associated_visceral_thromb_vte", 
  "cancer_associated_other_thromb_vte"
)
# Empty list to save ggbarstats plots
ggbarstats_plots = list()

for (variable in categorical_variables[1:79]) { 
  plot_i = ggbarstats(data = oncoth1_data,
                        x = patient_group,
                        y = !!variable, 
                        type = "nonparametric") +
             labs(caption = NULL)
  
  # Print plot
  print(plot_i)
  
  # Save plot in list
  ggbarstats_plots[[variable]] = plot_i
}

Correlation plot of comorbidities

# Get comorbidities features
comordibities = oncoth1_data %>%
  select(
    diabetes_mellitus, 
    dyslipidemia, 
    copd, 
    arterial_hypertension, 
    chronic_cardiac_insufficiency,
    renal_insufficiency, 
    venous_insufficiency
    ) %>%
  mutate_all( ~ ifelse(.x == "Yes", 1, 0)) %>%
  drop_na() %>% 
  as.matrix()
# Correlation plot for comorbidities
psych::corPlot(
  comordibities, 
  show.legend = TRUE, 
  symmetric = TRUE
  )

  • Diabetes mellitus is positively correlated with dyslipidemia and, in a lesser degree, with arterial hypertension and cardiac insufficiency.
  • Chronic cardiac insufficiency is positively correlated with renal insufficiency and, in a lesser extent, with venous insufficiency.